1 Introduction

Here, we will transfer the label and UMAP coordinates from the discovery cohort to the validation cohort. We will do it one compartment at a time (e.g. gcbc), so we can validate that the transfer is robust

Specifically, we will follow these steps for every compartment:

  1. Include or update the annotation label for this object.
  2. Plot the integrated object split by type (quey/reference) to visualize if discovery and validation cohorts were integrated properly.
  3. Transfer label.
  4. Transfer UMAP coordinates + visualize
  5. Exclude cells with a low annotation probability and/or a high doublet score
  6. Visualize heatmap of specific markers to validate that the cluster and annotation holds for the new cohort.

2 Pre-processing

2.1 Load packages

library(Seurat)
library(scCustomize)
library(tidyverse)
library(caret)
library(class)
library(harmony)
library(here)
library(glue)
library(DT)
library(pheatmap)

2.2 Source functions

source(here("scRNA-seq/bin/SLOcatoR_functions.R"))
source(here("scRNA-seq/bin/utils.R"))
source(here("scRNA-seq/bin/utils_final_clusters.R"))
color_palette <-  c("#1CFFCE", "#90AD1C", "#C075A6", "#85660D", "#5A5156", "#AA0DFE",
                    "#F8A19F", "#F7E1A0", "#1C8356", "#FEAF16", "#822E1C", "#C4451C",
                    "#1CBE4F", "#325A9B", "#F6222E", "#FE00FA", "#FBE426", "#16FF32",
                    "black",   "#3283FE", "#B00068", "#DEA0FD", "#B10DA1", "#E4E1E3",
                    "#90AD1C", "#FE00FA", "#85660D", "#3B00FB", "#822E1C", "coral2",
                    "#1CFFCE", "#1CBE4F", "#3283FE", "#FBE426", "#F7E1A0", "#325A9B",
                    "#2ED9FF", "#B5EFB5", "#5A5156", "#DEA0FD", "#FEAF16", "#683B79",
                    "#B10DA1", "#1C7F93", "#F8A19F", "dark orange", "#FEAF16", "#FBE426",
                    "Brown")

3 GCBC

# Read data, clean and include annotation
gcbc <- readRDS(here("scRNA-seq/results/R_objects/seurat_objects_revision/4.11-GCBC_seurat_object_discovery_validation_cohorts.rds"))

We proceed to update the annotation:

gcbc <- updateAnnotation(gcbc)
gcbc$annotation_20230508 <- gcbc$annotation_20220619
gcbc$annotation_20230508[is.na(gcbc$annotation_20230508)] <- "unannotated"
gcbc$annotation_20230508 <- factor(
  gcbc$annotation_20230508,
  levels = c(names(colors_20230508[["GCBC"]]), "unannotated")
)
Idents(gcbc) <- "annotation_20230508"
gcbc$is_reference <- ifelse(
  gcbc$cohort_type == "discovery",
  "reference",
  "query"
)
DimPlot(
  gcbc,
  group.by = "annotation_20230508",
  split.by = "cohort_type",
  cols = colors_20230508[["GCBC"]]
)

Let us explore if there are lingering doublets:

Idents(gcbc) <- "cohort_type"
FeaturePlot(gcbc, c("CD3D", "LYZ")) &
  scale_color_viridis_c(option = "magma")

FeaturePlot(gcbc[, gcbc$cohort_type == "validation"], "doublet_score_scDblFinder") &
  scale_color_viridis_c(option = "magma")

Indeed, one cluster is composed of doublets between B and T cells. Let us find it and remove it:

gcbc <- FindNeighbors(gcbc, dims = 1:25, reduction = "harmony")
gcbc <- FindClusters(gcbc, resolution = 1)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 93201
## Number of edges: 2614021
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8454
## Number of communities: 20
## Elapsed time: 36 seconds
DimPlot(gcbc, cols = rev(color_palette))

markers16 <- FindMarkers(gcbc, ident.1 = "16", only.pos = TRUE, logfc.threshold = 0.5)
head(markers16, 30)
##                   p_val avg_log2FC pct.1 pct.2     p_val_adj
## CD2        0.000000e+00  0.6856149 0.539 0.022  0.000000e+00
## TXNIP      0.000000e+00  0.9780004 0.847 0.292  0.000000e+00
## AAK1       0.000000e+00  0.5630971 0.495 0.040  0.000000e+00
## IL7R       0.000000e+00  0.9389565 0.628 0.018  0.000000e+00
## FYB1       0.000000e+00  0.9690267 0.742 0.046  0.000000e+00
## TCF7       0.000000e+00  0.5165520 0.471 0.061  0.000000e+00
## TRBC1      0.000000e+00  0.6907910 0.453 0.040  0.000000e+00
## TRBC2      0.000000e+00  0.6762832 0.643 0.190  0.000000e+00
## GIMAP7     0.000000e+00  0.8974861 0.657 0.033  0.000000e+00
## GIMAP4     0.000000e+00  0.6752352 0.521 0.027  0.000000e+00
## GIMAP5     0.000000e+00  0.5149683 0.420 0.015  0.000000e+00
## LINC00861  0.000000e+00  0.5514859 0.433 0.010  0.000000e+00
## CD3E       0.000000e+00  0.7747475 0.604 0.028  0.000000e+00
## CD3D       0.000000e+00  0.8821539 0.649 0.044  0.000000e+00
## CD3G       0.000000e+00  0.8096641 0.624 0.016  0.000000e+00
## BCL11B     0.000000e+00  0.6868253 0.567 0.014  0.000000e+00
## IL32       0.000000e+00  0.7945482 0.583 0.059  0.000000e+00
## SLFN5      0.000000e+00  0.5222730 0.502 0.085  0.000000e+00
## CCR7       0.000000e+00  0.5163333 0.443 0.041  0.000000e+00
## CD7        0.000000e+00  0.5152501 0.409 0.014  0.000000e+00
## SARAF     1.196002e-279  0.5336398 0.632 0.236 4.470415e-275
## SRGN      8.029466e-194  0.5098707 0.609 0.290 3.001254e-189
gcbc <- gcbc[, !(gcbc$seurat_clusters == "16" & gcbc$cohort_type == "validation")]
DimPlot(gcbc, cols = rev(color_palette), split.by = "cohort_type")

Reintegrate:

gcbc$type <- str_c(gcbc$cohort_type, gcbc$assay, sep = "_")
shared_hvg <- find_assay_specific_features(gcbc, assay_var = "type")
gcbc <- integrate_assays(
  gcbc,
  assay_var = "type",
  shared_hvg = shared_hvg
)
gcbc <- RunUMAP(gcbc, dims = 1:25, reduction = "harmony")
DimPlot(
  gcbc,
  group.by = "annotation_20230508",
  split.by = "cohort_type",
  cols = c(colors_20230508[["GCBC"]], "unannotated" = "gray")
)  &
  theme(legend.text = element_text(size = 6))

Transfer label:

# Split training and test sets
data_sets <- split_training_and_test_sets(
  gcbc,
  split_var = "cohort_type",
  referece_label = "discovery",
  query_label = "validation",
  reduction = "harmony",
  n_dims = 25
)


# Transfer label
annotation_query_df <- transfer_label(
  seurat_obj = gcbc,
  training_set = data_sets$training_set,
  test_set = data_sets$test_set,
  k = 30,
  response_var = "annotation_20230508"
)

# Include annotation and UMAP coords in Seurat object
gcbc$annotation_20230508[annotation_query_df$query_cells] <- annotation_query_df$annotation
Idents(gcbc) <- "annotation_20230508"
gcbc$annotation_20230508_probability <- NA
gcbc$annotation_20230508_probability[annotation_query_df$query_cells] <- annotation_query_df$annotation_prob
p1 <- FeaturePlot(
  gcbc[, annotation_query_df$query_cells],
  "annotation_20230508_probability"
) + scale_color_viridis_c(option = "magma")
p2 <- FeaturePlot(
  gcbc[, annotation_query_df$query_cells],
  "doublet_score_scDblFinder"
) + scale_color_viridis_c(option = "magma")
p1 | p2

DimPlot(
  gcbc,
  group.by = "annotation_20230508",
  split.by = "cohort_type",
  cols = colors_20230508[["GCBC"]]
)

table(gcbc$annotation_20230508_probability)
## 
## 0.233333333333333 0.266666666666667               0.3  0.32258064516129 0.333333333333333 0.354838709677419 0.366666666666667 0.387096774193548               0.4           0.40625 0.419354838709677 0.433333333333333 0.451612903225806 0.466666666666667 0.483870967741935               0.5 0.516129032258065 0.533333333333333 0.548387096774194 0.566666666666667 0.580645161290323               0.6 0.612903225806452 0.633333333333333 0.645161290322581 0.666666666666667  0.67741935483871               0.7 0.709677419354839 0.733333333333333 0.741935483870968 0.766666666666667 0.774193548387097           0.78125               0.8 0.806451612903226            0.8125 0.833333333333333 0.838709677419355 0.866666666666667 0.870967741935484             0.875               0.9 0.903225806451613 0.933333333333333 0.935483870967742 0.966666666666667 0.967741935483871           0.96875                 1 
##                 4                 6                23                 1                37                 3                84                 2               122                 1                 4               195                 5               265                 6               454                 6               566                14               597                 8               592                12               589                12               612                16               591                16               607                14               669                18                 1               693                15                 1               795                19               944                16                 1              1089                29              1469                37              2229                55                 2              6498

Plot dot plot

goi_rna <- c("CXCR4", "AICDA", "FOXP1", "MME", "CD83", "LMO2", "POLA1", "HIST1H4C",
             "MKI67", "TOP2A", "CDC20", "CCNB1", "STMN1", "HMGB2", "CD40", "BCL2A1",
             "MIR155HG", "EBI3", "TRAF1", "NFKB1", "NFKB2", "RELB", "MYC", "BATF",
             "CCR6", "CELF2", "BANK1", "PRDM1", "XBP1")
avgexpr_mat <- AverageExpression(
  gcbc[, gcbc$cohort_type == "validation"],
  features = goi_rna,
  assays = "RNA",
  return.seurat = FALSE,
  group.by = "annotation_20230508",
  slot = "data"
)$RNA


# Define annotation column
cell_types <- levels(gcbc$annotation_20230508)
mycolors <- list(cell_type = c(colors_20230508$GCBC))
annotation_col <- data.frame(cell_type = cell_types) 
rownames(annotation_col) <- cell_types


# Scale per row between 0 and 1 
input_mat <- t(apply(avgexpr_mat, 1, function(x) (x - min(x)) / diff(range(x))))


# Plot heatmap
pheatmap(
  t(input_mat),
  annotation_row = annotation_col,
  annotation_colors = mycolors,
  annotation_names_row = FALSE,
  annotation_legend = FALSE,
  show_rownames = FALSE,
  show_colnames = TRUE, 
  border_color = NA,
  cluster_rows = FALSE,
  cluster_cols = FALSE,
  gaps_col = c(1, 3, 8),
  fontsize_row = 6
)

4 Save

saveRDS(gcbc, here("scRNA-seq/results/R_objects/seurat_objects_revision/5.11-GCBC_seurat_object_discovery_validation_cohorts_annotation.rds"))

5 Session Information

sessionInfo()
## R version 4.2.2 (2022-10-31)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
## 
## Matrix products: default
## BLAS/LAPACK: /home/groups/singlecell/rmassoni/anaconda3/envs/richter/lib/libopenblasp-r0.3.21.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=es_ES.UTF-8        LC_COLLATE=en_US.UTF-8     LC_MONETARY=es_ES.UTF-8    LC_MESSAGES=en_US.UTF-8    LC_PAPER=es_ES.UTF-8       LC_NAME=C                  LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=es_ES.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] pheatmap_1.0.12    DT_0.27            glue_1.6.2         here_1.0.1         harmony_0.1.1      Rcpp_1.0.10        class_7.3-21       caret_6.0-94       lattice_0.20-45    forcats_1.0.0      stringr_1.5.0      dplyr_1.1.0        purrr_1.0.1        readr_2.1.4        tidyr_1.3.0        tibble_3.1.8       ggplot2_3.4.1      tidyverse_1.3.2    scCustomize_1.1.1  SeuratObject_4.1.3 Seurat_4.3.0       BiocStyle_2.26.0  
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.3             spatstat.explore_3.0-6 reticulate_1.26        tidyselect_1.2.0       htmlwidgets_1.6.1      grid_4.2.2             Rtsne_0.16             pROC_1.18.0            munsell_0.5.0          codetools_0.2-19       ica_1.0-3              future_1.31.0          miniUI_0.1.1.1         withr_2.5.0            spatstat.random_3.1-3  colorspace_2.1-0       progressr_0.13.0       highr_0.10             knitr_1.42             stats4_4.2.2           ROCR_1.0-11            tensor_1.5             listenv_0.9.0          labeling_0.4.2         polyclip_1.10-4        farver_2.1.1           rprojroot_2.0.3        parallelly_1.34.0      vctrs_0.5.2            generics_0.1.3         ipred_0.9-13           xfun_0.37              timechange_0.2.0       R6_2.5.1               ggbeeswarm_0.7.1       spatstat.utils_3.0-1   cachem_1.0.6           assertthat_0.2.1       promises_1.2.0.1       scales_1.2.1           nnet_7.3-18            googlesheets4_1.0.1    beeswarm_0.4.0         gtable_0.3.1           globals_0.16.2         goftest_1.2-3          timeDate_4022.108      rlang_1.0.6            GlobalOptions_0.1.2    splines_4.2.2          lazyeval_0.2.2        
##  [52] ModelMetrics_1.2.2.2   gargle_1.3.0           spatstat.geom_3.0-6    broom_1.0.3            BiocManager_1.30.19    yaml_2.3.7             reshape2_1.4.4         abind_1.4-5            modelr_0.1.10          backports_1.4.1        httpuv_1.6.9           tools_4.2.2            lava_1.7.1             bookdown_0.33          ellipsis_0.3.2         jquerylib_0.1.4        RColorBrewer_1.1-3     ggridges_0.5.4         plyr_1.8.8             rpart_4.1.19           deldir_1.0-6           pbapply_1.7-0          cowplot_1.1.1          zoo_1.8-11             haven_2.5.1            ggrepel_0.9.3          cluster_2.1.4          fs_1.6.1               magrittr_2.0.3         data.table_1.14.6      scattermore_0.8        circlize_0.4.15        lmtest_0.9-40          reprex_2.0.2           RANN_2.6.1             googledrive_2.0.0      fitdistrplus_1.1-8     matrixStats_0.63.0     hms_1.1.2              patchwork_1.1.2        mime_0.12              evaluate_0.20          xtable_1.8-4           readxl_1.4.2           gridExtra_2.3          shape_1.4.6            compiler_4.2.2         KernSmooth_2.23-20     crayon_1.5.2           htmltools_0.5.4        later_1.3.0           
## [103] tzdb_0.3.0             ggprism_1.0.4          lubridate_1.9.1        DBI_1.1.3              dbplyr_2.3.0           MASS_7.3-58.2          Matrix_1.5-3           cli_3.6.0              parallel_4.2.2         gower_1.0.1            igraph_1.3.5           pkgconfig_2.0.3        sp_1.6-0               plotly_4.10.1          spatstat.sparse_3.0-0  recipes_1.0.4          xml2_1.3.3             paletteer_1.5.0        foreach_1.5.2          vipor_0.4.5            bslib_0.4.2            hardhat_1.2.0          prodlim_2019.11.13     rvest_1.0.3            snakecase_0.11.0       digest_0.6.31          sctransform_0.3.5      RcppAnnoy_0.0.20       janitor_2.2.0          spatstat.data_3.0-0    rmarkdown_2.20         cellranger_1.1.0       leiden_0.4.3           uwot_0.1.14            shiny_1.7.4            lifecycle_1.0.3        nlme_3.1-162           jsonlite_1.8.4         limma_3.54.0           viridisLite_0.4.1      fansi_1.0.4            pillar_1.8.1           ggrastr_1.0.1          fastmap_1.1.0          httr_1.4.4             survival_3.5-3         png_0.1-8              iterators_1.0.14       stringi_1.7.12         sass_0.4.5             rematch2_2.1.2        
## [154] irlba_2.3.5.1          future.apply_1.10.0